중첩 반복문 튜닝

밀집 행렬의 곱셈

const unsigned s=128;

matrix<float> A(s, s), B(s, s), C(s, s);

for(unsigned i=0; i<s; ++i){

for(unsigned j=0; j<s; j++){

A(i, j)=100.0*i+j;

B(i, j)=200.0*i+j;

}

mult(A, B, C);

행렬 곱셈을 위한 mult 템플릿 표현식

template <typename Matrix>

inline void mult(const Matrix& A, const Matrix& B, Matrix& C){

assert(A.num_rows()==B.num_rows());

typedef typename Matrix::value_type value_type;

unsigned s=A.num_rows();

for(unsigned i=0; i<s; ++i){

for(unsigned k=0; k<s; k++){

value_type tmp(0);

for(unsigned j=0; j<s; j++) tmp+=A(i, j*B(j, k));

C(i, k)=tmp;

}

mult을 수행하기 위해 세번의 중첩된 반복문을 사용한다.

성능 향상을 위해서 밖의 외부의 2개의 반복문을 메타 튜닝을 이용해서 풀어내고,

내부 반복문에서 블록 연산을 수행한다. 결과를 담는 C는 임시변수를 사용한다.(mult_tmp)

template <unsigned BSize, typename Value>

struct multi_tmp{

typedef multi_tmp<BSize-1, Value> sub_type;

multi_tmp(const Value& v):value(v), sub(v) {}

Value value;

sub_type sub;

Value sum() const { return value+sub.sum(); }

};

template <typename Value>

struct multi_tmp<0, Value>{

multi_tmp(const Value& v) {}

Value sum() const { return 0; }

};

template <unsigned Index0, unsigned Max0, unsigned Index1, unsigned Max1>

struct mult_block{

typedef mult_block<Index0, Max0, Index1+1, Max1> next;

template <typename Tmp, typename Matrix>

void operator()(Tmp& tmp, const Matrix& A, const Matrix& B, unsigned i, unsigned j, unsigned k){

tmp.value+=A(i+Index0, j)*B(j, k+Index1);

next()(tmp.sub, A, B, i, j, k);

}

template <typename Tmp, typename Matrix>

void update(const Tmp& tmp, Matrix& C, unsigned i, unsigned k){

C(i+Index0, k+Index1)=tmp.value;

next().update(tmp.sub, C, i, k);

}

};

template <unsigned Index0, unsigned Max0, unsigned Max1>

struct mult_block<Index0, Max0, Max1, Max1>{

typedef mult_block<Index0+1, Max0, 0, Max1> next;

template <typename Tmp, typename Matrix>

void operator()(Tmp& tmp, const Matrix& A, const Matrix& B, unsigned i, unsigned j, unsigned k){

tmp.value+=A(i+Index0, j)*B(j, k+Max1);

next()(tmp.sub, A, B, i, j, k);

}

template <typename Tmp, typename Matrix>

void update(const Tmp& tmp, Matrix& C, unsigned i, unsigned k){

C(i+Index0, k+Max1)=tmp.value;

next().update(tmp.sub, C, i, k);

}

};

template <unsigned Max0, unsigned Max1>

struct mult_block<Max0, Max0, Max1, Max1>{

template <typename Tmp, typename Matrix>

void operator()(Tmp& tmp, const Matrix& A, const Matrix& B, unsigned i, unsigned j, unsigned k){

tmp.value+=A(i, Max0, j)*B(j, k+Max1);

}

template <typename Tmp, typename Matrix>

void update(const Tmp& tmp, Matrix& C, unsigned i, unsigned k){

C(i+Max0, k+Max1)=tmp.value;

}

};

template <unsigned Size0, unsigned Size1, typename Matrix>

inline void mult(const Matrix& A, const Matrix& B, Matrix& C){

using value_type=typename Matrix::value_type;

unsigned s=A.num_rows();

mult_block<0, Size0-1, 0, Size1-1> block;

for(unsigned int i=0; i<s; i+=Size0){

for(unsigned int k=0; k<s; k+=Size1){

mult_tmp<Size0*Size1, value_type> tmp(value_type(0));

for(unsigned j=0; j<s; j++) block(tmp, A, B, i ,j, k);

block.update(tmp, C, i, k);

}

단순화(loop2)

재사용할 가능성 증가(첫 번째 특수화를 생략해도 괜찮음)

template <unsigned Index0, unsigned Max0, unsigned Index1, unsigned Max1>

struct loop2{

static const unsigned next_index0=Index0, next_index1=Index1+1;

};

template <unsigned Index0, unsigned Max0, unsigned Max1>

struct loop2<Index0, Max0, Max1, Max1>{

static const unsigned next_index0=Index0+1, next_index1=0;

};

// loop2와 결합(첫 번째 특수화 생략 가능)

template <unsigned Index0, unsigned Max0, unsigned Index1, unsigned Max1>

struct mult_block{

typedef loop2<Index0, Max0, Index1, Max1> l;

typedef mult_block<l::next_index0, Max0, l::next_index1, Max1> next;

template <typename Tmp, typename Matrix>

void operator()(Tmp& tmp, const Matrix& A, const Matrix& B, unsigned i, unsigned j, unsigned k){

tmp.value+=A(i, Index0, j)*B(j, k+Index1);

next()(tmp.sub, A, B, i, j, k);

}

template <typename Tmp, typename Matrix>

void update(const Tmp& tmp, Matrix& C, unsigned i, unsigned k){

C(i+Index0, k+Index1)=tmp.value;

next().update(tmp.sub, C, i, k);

}

};

// 이하 특수화는 동일

Meta-Tunning 4